1 Exploratory Data Analysis (EDA)

1.1 Looking at the raw values

# Show every column with its type and its first few raw values
glimpse(listings)
Rows: 6,296
Columns: 74
$ id                                           <dbl> 2797791, 4990531, 6619374…
$ listing_url                                  <chr> "https://www.airbnb.com/r…
$ scrape_id                                    <dbl> 2.021093e+13, 2.021093e+1…
$ last_scraped                                 <date> 2021-09-29, 2021-09-28, …
$ name                                         <chr> "Beijing Great Wall Escap…
$ description                                  <chr> "A perfect escape only 2 …
$ neighborhood_overview                        <chr> "Located in a small villa…
$ picture_url                                  <chr> "https://a0.muscache.com/…
$ host_id                                      <dbl> 14311129, 25729513, 34492…
$ host_url                                     <chr> "https://www.airbnb.com/u…
$ host_name                                    <chr> "Andrew", "Joel", "ä¹ęž—",…
$ host_since                                   <date> 2014-04-15, 2015-01-07, …
$ host_location                                <chr> "Beijing, Beijing, China"…
$ host_about                                   <chr> "Been living in Beijing f…
$ host_response_time                           <chr> "within a few hours", "wi…
$ host_response_rate                           <chr> "100%", "100%", "N/A", "1…
$ host_acceptance_rate                         <chr> "73%", "99%", "N/A", "100…
$ host_is_superhost                            <lgl> FALSE, FALSE, FALSE, FALS…
$ host_thumbnail_url                           <chr> "https://a0.muscache.com/…
$ host_picture_url                             <chr> "https://a0.muscache.com/…
$ host_neighbourhood                           <chr> NA, "Shichahai", NA, NA, …
$ host_listings_count                          <dbl> 1, 10, 2, 1, 3, 1, 5, 5, …
$ host_total_listings_count                    <dbl> 1, 10, 2, 1, 3, 1, 5, 5, …
$ host_verifications                           <chr> "['email', 'phone', 'revi…
$ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE, T…
$ host_identity_verified                       <lgl> TRUE, TRUE, TRUE, TRUE, T…
$ neighbourhood                                <chr> "Beijing, China", "Beijin…
$ neighbourhood_cleansed                       <chr> "怀柔区 / Huairou", "东城…
$ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, N…
$ latitude                                     <dbl> 40.47329, 39.94193, 40.44…
$ longitude                                    <dbl> 116.5451, 116.3984, 116.0…
$ property_type                                <chr> "Entire residential home"…
$ room_type                                    <chr> "Entire home/apt", "Entir…
$ accommodates                                 <dbl> 10, 4, 15, 16, 12, 16, 12…
$ bathrooms                                    <lgl> NA, NA, NA, NA, NA, NA, N…
$ bathrooms_text                               <chr> "1 bath", "1 bath", "4 ba…
$ bedrooms                                     <dbl> 3, 1, 4, 1, 4, 5, 2, 4, 4…
$ beds                                         <dbl> 3, 2, 4, 2, 5, 9, 11, 12,…
$ amenities                                    <chr> "[\"Dishes and silverware…
$ price                                        <chr> "$1,914.00", "$1,610.00",…
$ minimum_nights                               <dbl> 1, 29, 1, 1, 1, 1, 1, 1, …
$ maximum_nights                               <dbl> 1125, 365, 1125, 1125, 11…
$ minimum_minimum_nights                       <dbl> 1, 29, 1, 1, 1, 1, 1, 1, …
$ maximum_minimum_nights                       <dbl> 1, 29, 1, 1, 1, 1, 1, 1, …
$ minimum_maximum_nights                       <dbl> 1125, 1125, 1125, 1125, 1…
$ maximum_maximum_nights                       <dbl> 1125, 1125, 1125, 1125, 1…
$ minimum_nights_avg_ntm                       <dbl> 1, 29, 1, 1, 1, 1, 1, 1, …
$ maximum_nights_avg_ntm                       <dbl> 1125, 1125, 1125, 1125, 1…
$ calendar_updated                             <lgl> NA, NA, NA, NA, NA, NA, N…
$ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE, T…
$ availability_30                              <dbl> 20, 0, 25, 30, 24, 26, 24…
$ availability_60                              <dbl> 50, 0, 29, 60, 27, 56, 54…
$ availability_90                              <dbl> 53, 0, 29, 90, 27, 86, 84…
$ availability_365                             <dbl> 234, 118, 29, 365, 298, 3…
$ calendar_last_scraped                        <date> 2021-09-29, 2021-09-28, …
$ number_of_reviews                            <dbl> 56, 20, 3, 1, 2, 14, 61, …
$ number_of_reviews_ltm                        <dbl> 3, 0, 0, 0, 1, 3, 15, 5, …
$ number_of_reviews_l30d                       <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0…
$ first_review                                 <date> 2015-05-04, 2016-12-31, …
$ last_review                                  <date> 2019-05-11, 2020-07-17, …
$ review_scores_rating                         <dbl> 4.63, 4.68, 5.00, 0.00, 5…
$ review_scores_accuracy                       <dbl> 4.72, 4.71, 5.00, NA, 5.0…
$ review_scores_cleanliness                    <dbl> 4.24, 4.82, 4.67, NA, 5.0…
$ review_scores_checkin                        <dbl> 4.89, 5.00, 5.00, NA, 5.0…
$ review_scores_communication                  <dbl> 4.92, 4.88, 5.00, NA, 5.0…
$ review_scores_location                       <dbl> 4.91, 4.88, 5.00, NA, 5.0…
$ review_scores_value                          <dbl> 4.30, 4.76, 4.33, NA, 5.0…
$ license                                      <lgl> NA, NA, NA, NA, NA, NA, N…
$ instant_bookable                             <lgl> FALSE, TRUE, FALSE, TRUE,…
$ calculated_host_listings_count               <dbl> 1, 10, 1, 1, 3, 1, 1, 2, …
$ calculated_host_listings_count_entire_homes  <dbl> 1, 6, 1, 0, 3, 1, 1, 2, 3…
$ calculated_host_listings_count_private_rooms <dbl> 0, 4, 0, 1, 0, 0, 0, 0, 0…
$ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ reviews_per_month                            <dbl> 0.72, 0.35, 0.11, 0.02, 0…

1.2 Summary statistics

# Summary statistics for every column, reported by column type
listings %>%
skim()
Data summary
Name Piped data
Number of rows 6296
Number of columns 74
_______________________
Column type frequency:
character 23
Date 5
logical 9
numeric 37
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 36 37 0 6296 0
name 0 1.00 1 185 0 6117 0
description 265 0.96 2 1000 0 5166 0
neighborhood_overview 1139 0.82 2 1000 0 3790 0
picture_url 0 1.00 63 112 0 6032 0
host_url 0 1.00 41 43 0 2665 0
host_name 0 1.00 1 41 0 2299 0
host_location 4 1.00 2 40 0 41 0
host_about 3332 0.47 1 4820 0 1063 1
host_response_time 0 1.00 3 18 0 5 0
host_response_rate 0 1.00 2 4 0 37 0
host_acceptance_rate 0 1.00 2 4 0 46 0
host_thumbnail_url 0 1.00 55 106 0 2658 0
host_picture_url 0 1.00 57 109 0 2658 0
host_neighbourhood 5938 0.06 2 25 0 28 0
host_verifications 0 1.00 2 151 0 99 0
neighbourhood 1139 0.82 14 34 0 8 0
neighbourhood_cleansed 0 1.00 3 16 0 16 0
property_type 0 1.00 3 35 0 80 0
room_type 0 1.00 11 15 0 3 0
bathrooms_text 5 1.00 6 17 0 70 0
amenities 0 1.00 27 1206 0 5199 0
price 0 1.00 6 10 0 3029 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
last_scraped 0 1.00 2021-09-28 2021-09-29 2021-09-28 2
host_since 0 1.00 2013-02-06 2021-09-19 2019-05-21 1370
calendar_last_scraped 0 1.00 2021-09-28 2021-09-29 2021-09-28 2
first_review 3203 0.49 2015-05-04 2021-09-28 2020-07-18 843
last_review 3203 0.49 2016-04-04 2021-09-28 2021-04-30 730

Variable type: logical

skim_variable n_missing complete_rate mean count
host_is_superhost 0 1 0.24 FAL: 4811, TRU: 1485
host_has_profile_pic 0 1 1.00 TRU: 6288, FAL: 8
host_identity_verified 0 1 1.00 TRU: 6279, FAL: 17
neighbourhood_group_cleansed 6296 0 NaN :
bathrooms 6296 0 NaN :
calendar_updated 6296 0 NaN :
has_availability 0 1 1.00 TRU: 6296
license 6296 0 NaN :
instant_bookable 0 1 0.65 TRU: 4111, FAL: 2185

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 4.068584e+07 9018880.17 2.797791e+06 3.513261e+07 4.309471e+07 4.883673e+07 5.245929e+07 ▁▁▂▅▇
scrape_id 0 1.00 2.021093e+13 0.00 2.021093e+13 2.021093e+13 2.021093e+13 2.021093e+13 2.021093e+13 ▁▁▇▁▁
host_id 0 1.00 2.545746e+08 106847611.68 4.984459e+06 1.828459e+08 2.631712e+08 3.492971e+08 4.236643e+08 ▂▅▆▇▇
host_listings_count 0 1.00 7.470000e+00 14.76 0.000000e+00 1.000000e+00 5.000000e+00 9.000000e+00 2.570000e+02 ▇▁▁▁▁
host_total_listings_count 0 1.00 7.470000e+00 14.76 0.000000e+00 1.000000e+00 5.000000e+00 9.000000e+00 2.570000e+02 ▇▁▁▁▁
latitude 0 1.00 4.031000e+01 0.30 3.947000e+01 4.019000e+01 4.041000e+01 4.050000e+01 4.095000e+01 ▁▂▂▇▁
longitude 0 1.00 1.164300e+02 0.47 1.154400e+02 1.160200e+02 1.164200e+02 1.167000e+02 1.175000e+02 ▂▆▇▃▃
accommodates 0 1.00 7.100000e+00 5.11 1.000000e+00 2.000000e+00 5.000000e+00 1.200000e+01 1.600000e+01 ▇▂▂▂▃
bedrooms 61 0.99 3.060000e+00 2.49 1.000000e+00 1.000000e+00 2.000000e+00 5.000000e+00 2.500000e+01 ▇▂▁▁▁
beds 19 1.00 4.310000e+00 4.33 0.000000e+00 1.000000e+00 3.000000e+00 6.000000e+00 7.100000e+01 ▇▁▁▁▁
minimum_nights 0 1.00 1.380000e+00 9.33 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 3.650000e+02 ▇▁▁▁▁
maximum_nights 0 1.00 8.738100e+02 418.94 1.000000e+00 3.650000e+02 1.125000e+03 1.125000e+03 1.125000e+03 ▂▂▁▁▇
minimum_minimum_nights 0 1.00 1.360000e+00 9.30 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 3.650000e+02 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 1.540000e+00 15.67 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+03 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 9.331500e+02 378.79 1.000000e+00 1.125000e+03 1.125000e+03 1.125000e+03 1.125000e+03 ▁▁▁▁▇
maximum_maximum_nights 0 1.00 9.353200e+02 377.05 1.000000e+00 1.125000e+03 1.125000e+03 1.125000e+03 1.125000e+03 ▁▁▁▁▇
minimum_nights_avg_ntm 0 1.00 1.410000e+00 9.98 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 3.650000e+02 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 9.349400e+02 376.98 1.000000e+00 1.125000e+03 1.125000e+03 1.125000e+03 1.125000e+03 ▁▁▁▁▇
availability_30 0 1.00 1.917000e+01 9.82 0.000000e+00 1.600000e+01 2.400000e+01 2.500000e+01 3.000000e+01 ▅▁▁▇▇
availability_60 0 1.00 4.524000e+01 16.59 0.000000e+00 3.500000e+01 5.300000e+01 5.500000e+01 6.000000e+01 ▁▁▂▁▇
availability_90 0 1.00 7.160000e+01 24.32 0.000000e+00 6.300000e+01 8.300000e+01 8.500000e+01 9.000000e+01 ▁▁▁▂▇
availability_365 0 1.00 2.493600e+02 126.21 0.000000e+00 1.530000e+02 3.370000e+02 3.590000e+02 3.650000e+02 ▂▂▂▁▇
number_of_reviews 0 1.00 3.300000e+00 11.44 0.000000e+00 0.000000e+00 0.000000e+00 2.000000e+00 4.600000e+02 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 1.450000e+00 4.37 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 9.700000e+01 ▇▁▁▁▁
number_of_reviews_l30d 0 1.00 1.300000e-01 0.64 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.600000e+01 ▇▁▁▁▁
review_scores_rating 3203 0.49 4.670000e+00 1.00 0.000000e+00 4.840000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_accuracy 3312 0.47 4.900000e+00 0.37 1.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_cleanliness 3312 0.47 4.870000e+00 0.39 1.000000e+00 4.920000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_checkin 3312 0.47 4.910000e+00 0.36 1.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_communication 3312 0.47 4.920000e+00 0.35 1.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_location 3312 0.47 4.860000e+00 0.38 1.000000e+00 4.890000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_value 3312 0.47 4.800000e+00 0.48 1.000000e+00 4.800000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
calculated_host_listings_count 0 1.00 5.910000e+00 6.12 1.000000e+00 1.000000e+00 4.000000e+00 8.000000e+00 3.300000e+01 ▇▂▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 2.350000e+00 3.83 0.000000e+00 0.000000e+00 1.000000e+00 2.000000e+00 3.100000e+01 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 3.490000e+00 5.33 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 3.300000e+01 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 7.000000e-02 0.64 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 9.000000e+00 ▇▁▁▁▁
reviews_per_month 3203 0.49 5.200000e-01 0.84 2.000000e-02 1.100000e-01 2.600000e-01 6.100000e-01 1.600000e+01 ▇▁▁▁▁

Beijing has 6296 accommodations listed on Airbnb.

1.3 Data wrangling

Since price is a quantitative variable, we need to make sure it is stored as numeric data (num) in the data frame rather than as text.

# price arrives as text such as "$1,914.00"; convert it to a number and
# confirm the storage type afterwards
listings <- mutate(listings, price = parse_number(price))
typeof(listings[["price"]])
[1] "double"
# Re-run the summary to check that price is now reported as a numeric column
skim(listings)
Data summary
Name listings
Number of rows 6296
Number of columns 74
_______________________
Column type frequency:
character 22
Date 5
logical 9
numeric 38
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 36 37 0 6296 0
name 0 1.00 1 185 0 6117 0
description 265 0.96 2 1000 0 5166 0
neighborhood_overview 1139 0.82 2 1000 0 3790 0
picture_url 0 1.00 63 112 0 6032 0
host_url 0 1.00 41 43 0 2665 0
host_name 0 1.00 1 41 0 2299 0
host_location 4 1.00 2 40 0 41 0
host_about 3332 0.47 1 4820 0 1063 1
host_response_time 0 1.00 3 18 0 5 0
host_response_rate 0 1.00 2 4 0 37 0
host_acceptance_rate 0 1.00 2 4 0 46 0
host_thumbnail_url 0 1.00 55 106 0 2658 0
host_picture_url 0 1.00 57 109 0 2658 0
host_neighbourhood 5938 0.06 2 25 0 28 0
host_verifications 0 1.00 2 151 0 99 0
neighbourhood 1139 0.82 14 34 0 8 0
neighbourhood_cleansed 0 1.00 3 16 0 16 0
property_type 0 1.00 3 35 0 80 0
room_type 0 1.00 11 15 0 3 0
bathrooms_text 5 1.00 6 17 0 70 0
amenities 0 1.00 27 1206 0 5199 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
last_scraped 0 1.00 2021-09-28 2021-09-29 2021-09-28 2
host_since 0 1.00 2013-02-06 2021-09-19 2019-05-21 1370
calendar_last_scraped 0 1.00 2021-09-28 2021-09-29 2021-09-28 2
first_review 3203 0.49 2015-05-04 2021-09-28 2020-07-18 843
last_review 3203 0.49 2016-04-04 2021-09-28 2021-04-30 730

Variable type: logical

skim_variable n_missing complete_rate mean count
host_is_superhost 0 1 0.24 FAL: 4811, TRU: 1485
host_has_profile_pic 0 1 1.00 TRU: 6288, FAL: 8
host_identity_verified 0 1 1.00 TRU: 6279, FAL: 17
neighbourhood_group_cleansed 6296 0 NaN :
bathrooms 6296 0 NaN :
calendar_updated 6296 0 NaN :
has_availability 0 1 1.00 TRU: 6296
license 6296 0 NaN :
instant_bookable 0 1 0.65 TRU: 4111, FAL: 2185

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 4.068584e+07 9018880.17 2.797791e+06 3.513261e+07 4.309471e+07 4.883673e+07 5.245929e+07 ▁▁▂▅▇
scrape_id 0 1.00 2.021093e+13 0.00 2.021093e+13 2.021093e+13 2.021093e+13 2.021093e+13 2.021093e+13 ▁▁▇▁▁
host_id 0 1.00 2.545746e+08 106847611.68 4.984459e+06 1.828459e+08 2.631712e+08 3.492971e+08 4.236643e+08 ▂▅▆▇▇
host_listings_count 0 1.00 7.470000e+00 14.76 0.000000e+00 1.000000e+00 5.000000e+00 9.000000e+00 2.570000e+02 ▇▁▁▁▁
host_total_listings_count 0 1.00 7.470000e+00 14.76 0.000000e+00 1.000000e+00 5.000000e+00 9.000000e+00 2.570000e+02 ▇▁▁▁▁
latitude 0 1.00 4.031000e+01 0.30 3.947000e+01 4.019000e+01 4.041000e+01 4.050000e+01 4.095000e+01 ▁▂▂▇▁
longitude 0 1.00 1.164300e+02 0.47 1.154400e+02 1.160200e+02 1.164200e+02 1.167000e+02 1.175000e+02 ▂▆▇▃▃
accommodates 0 1.00 7.100000e+00 5.11 1.000000e+00 2.000000e+00 5.000000e+00 1.200000e+01 1.600000e+01 ▇▂▂▂▃
bedrooms 61 0.99 3.060000e+00 2.49 1.000000e+00 1.000000e+00 2.000000e+00 5.000000e+00 2.500000e+01 ▇▂▁▁▁
beds 19 1.00 4.310000e+00 4.33 0.000000e+00 1.000000e+00 3.000000e+00 6.000000e+00 7.100000e+01 ▇▁▁▁▁
price 0 1.00 2.417430e+03 2676.84 5.900000e+01 6.190000e+02 1.555000e+03 3.511500e+03 6.399500e+04 ▇▁▁▁▁
minimum_nights 0 1.00 1.380000e+00 9.33 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 3.650000e+02 ▇▁▁▁▁
maximum_nights 0 1.00 8.738100e+02 418.94 1.000000e+00 3.650000e+02 1.125000e+03 1.125000e+03 1.125000e+03 ▂▂▁▁▇
minimum_minimum_nights 0 1.00 1.360000e+00 9.30 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 3.650000e+02 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 1.540000e+00 15.67 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+03 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 9.331500e+02 378.79 1.000000e+00 1.125000e+03 1.125000e+03 1.125000e+03 1.125000e+03 ▁▁▁▁▇
maximum_maximum_nights 0 1.00 9.353200e+02 377.05 1.000000e+00 1.125000e+03 1.125000e+03 1.125000e+03 1.125000e+03 ▁▁▁▁▇
minimum_nights_avg_ntm 0 1.00 1.410000e+00 9.98 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 3.650000e+02 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 9.349400e+02 376.98 1.000000e+00 1.125000e+03 1.125000e+03 1.125000e+03 1.125000e+03 ▁▁▁▁▇
availability_30 0 1.00 1.917000e+01 9.82 0.000000e+00 1.600000e+01 2.400000e+01 2.500000e+01 3.000000e+01 ▅▁▁▇▇
availability_60 0 1.00 4.524000e+01 16.59 0.000000e+00 3.500000e+01 5.300000e+01 5.500000e+01 6.000000e+01 ▁▁▂▁▇
availability_90 0 1.00 7.160000e+01 24.32 0.000000e+00 6.300000e+01 8.300000e+01 8.500000e+01 9.000000e+01 ▁▁▁▂▇
availability_365 0 1.00 2.493600e+02 126.21 0.000000e+00 1.530000e+02 3.370000e+02 3.590000e+02 3.650000e+02 ▂▂▂▁▇
number_of_reviews 0 1.00 3.300000e+00 11.44 0.000000e+00 0.000000e+00 0.000000e+00 2.000000e+00 4.600000e+02 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 1.450000e+00 4.37 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 9.700000e+01 ▇▁▁▁▁
number_of_reviews_l30d 0 1.00 1.300000e-01 0.64 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.600000e+01 ▇▁▁▁▁
review_scores_rating 3203 0.49 4.670000e+00 1.00 0.000000e+00 4.840000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_accuracy 3312 0.47 4.900000e+00 0.37 1.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_cleanliness 3312 0.47 4.870000e+00 0.39 1.000000e+00 4.920000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_checkin 3312 0.47 4.910000e+00 0.36 1.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_communication 3312 0.47 4.920000e+00 0.35 1.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_location 3312 0.47 4.860000e+00 0.38 1.000000e+00 4.890000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
review_scores_value 3312 0.47 4.800000e+00 0.48 1.000000e+00 4.800000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ▁▁▁▁▇
calculated_host_listings_count 0 1.00 5.910000e+00 6.12 1.000000e+00 1.000000e+00 4.000000e+00 8.000000e+00 3.300000e+01 ▇▂▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 2.350000e+00 3.83 0.000000e+00 0.000000e+00 1.000000e+00 2.000000e+00 3.100000e+01 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 3.490000e+00 5.33 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 3.300000e+01 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 7.000000e-02 0.64 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 9.000000e+00 ▇▁▁▁▁
reviews_per_month 3203 0.49 5.200000e-01 0.84 2.000000e-02 1.100000e-01 2.600000e-01 6.100000e-01 1.600000e+01 ▇▁▁▁▁

1.4 Data visualisations

# Price (per bedroom) distribution by room type
# One histogram panel per room type. alpha and bins are fixed settings, so
# they are passed to geom_histogram() directly: a constant placed inside
# aes() is treated as data, which adds a meaningless legend entry and sends
# the value through the alpha scale instead of using it as written.
# bins = 30 is the ggplot2 default, stated explicitly.
listings %>%
  filter(!is.na(room_type)) %>%
  mutate(price_per_bedroom = price / bedrooms) %>%
  ggplot(aes(x = price_per_bedroom, colour = room_type)) +
  geom_histogram(alpha = 0.4, bins = 30) +
  facet_wrap(~room_type) +
  theme_bw() +
  labs(title = "Price Distribution by Room Type")

Most of the available accommodations are entire homes/apartments. One simple reason is that sharing rooms is quite uncommon in Beijing. That said, this graph does not show a large disparity in price between the different room types, since all of the distributions are skewed to the right.

# Box plot of price per bedroom by neighbourhoods
# The y axis is zoomed to 0-2500 with coord_cartesian() rather than with
# scale limits: scale limits discard every listing above 2500 before the box
# statistics are computed, which distorts the medians and quartiles drawn.
listings %>%
  filter(!is.na(neighbourhood_cleansed)) %>%
  mutate(price_per_bedroom = price / bedrooms) %>%
  ggplot(aes(x = factor(neighbourhood_cleansed))) +
  geom_boxplot(aes(y = price_per_bedroom)) +
  # Rotate the neighbourhood labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_cartesian(ylim = c(0, 2500)) +
  labs(title = "Box Plot of Price per Bedroom by Neighbourhoods")

# Number of listings in each neighbourhood
listings_1 <- count(listings, neighbourhood_cleansed)

# Horizontal bar chart of the 25 neighbourhoods with the most listings
# (ties included), ordered so the largest bar is at the top
listings_1 %>%
  slice_max(order_by = n, n = 25) %>%
  ggplot(aes(y = fct_reorder(neighbourhood_cleansed, n), x = n)) +
  geom_col() +
  labs(
    title = "Neighbourhood ranked by number of listings",
    x = "Number of Listings",
    y = "Neighbourhood"
  )

From the map, we can see that Airbnb listings in Beijing are concentrated in the city centre and in the northern region around Huairou District. The popularity of the centre is explained by the convenience of commuting within central Beijing and the high cost of alternative lodging such as hotels. Meanwhile, there are many scenic spots on the northern side, far from the city centre (around 2 hours of commuting), where the properties are more appealing thanks to their views. Tourists therefore prefer to stay at an Airbnb when they want to visit these spots.

# Correlation matrix of key variables
# Add log_price (natural log of price) as a new column, then draw the
# pairwise plots and correlations for the selected variables
listings <- mutate(listings, log_price = log(price))

ggpairs(
  listings,
  columns = c(
    "log_price", "accommodates", "bedrooms",
    "availability_30", "availability_60", "review_scores_rating",
    "beds", "number_of_reviews", "minimum_nights"
  )
)

The highest correlations are between beds, bedrooms and accommodates, which is to be expected. Another interesting but logical correlation is between price and accommodates: an apartment that can accommodate 8 people is naturally more expensive than one that accommodates only 2 people.

# Box plot of price per bedroom by whether the host is super host
# The y axis is zoomed to 0-2500 with coord_cartesian() rather than with
# scale limits: scale limits discard every listing above 2500 before the box
# statistics are computed, which would bias the comparison between groups.
listings %>%
  filter(!is.na(host_is_superhost)) %>%
  mutate(price_per_bedroom = price / bedrooms) %>%
  ggplot(aes(x = factor(host_is_superhost))) +
  geom_boxplot(aes(y = price_per_bedroom)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_cartesian(ylim = c(0, 2500)) +
  labs(title = "Box Plot of Price per Bedroom by the host")

The second box plot describes the relationship between price and whether the host is a superhost. We can see a small difference, but surprisingly not a big one, suggesting that being a superhost doesn’t increase demand for the accommodation that much and doesn’t seem to impact prices. We’ll try to confirm or reject that later on in the regression analysis.

1.5 Property types

# Keep the four most common property types as their own categories and
# collapse every other type into "Other"
listings <- mutate(
  listings,
  prop_type_simplified = if_else(
    property_type %in% c(
      "Entire villa",
      "Entire residential home",
      "Farm stay",
      "Private room in farm stay"
    ),
    property_type,
    "Other"
  )
)

The most common property types are Entire villa, Entire residential home, Farm stay, and Private room in farm stay, which is consistent with the first graph we plotted.

# Check the recoding: each original property type next to its simplified
# category, most frequent first
count(listings, property_type, prop_type_simplified) %>%
  arrange(desc(n))
property_type | prop_type_simplified | n
Entire villa | Entire villa | 812
Entire residential home | Entire residential home | 620
Farm stay | Farm stay | 618
Private room in farm stay | Private room in farm stay | 556
Entire cottage | Other | 516
Private room in kezhan | Other | 375
Private room in villa | Other | 290
Room in boutique hotel | Other | 279
Private room in residential home | Other | 273
Room in hotel | Other | 233
Entire bungalow | Other | 217
Private room in cottage | Other | 207
Entire rental unit | Other | 156
Entire townhouse | Other | 135
Private room in bed and breakfast | Other | 118
Private room in serviced apartment | Other | 84
Private room in resort | Other | 80
Private room in bungalow | Other | 68
Entire loft | Other | 61
Kezhan | Other | 60
Private room in nature lodge | Other | 53
Private room in townhouse | Other | 46
Entire cabin | Other | 40
Private room in rental unit | Other | 40
Shared room in hostel | Other | 32
Entire condominium (condo) | Other | 28
Entire serviced apartment | Other | 28
Private room in hostel | Other | 26
Private room | Other | 23
Earth house | Other | 22
Room in aparthotel | Other | 17
Private room in loft | Other | 15
Private room in guesthouse | Other | 14
Entire place | Other | 11
Entire chalet | Other | 10
Campsite | Other | 9
Private room in earth house | Other | 7
Entire bed and breakfast | Other | 6
Entire home/apt | Other | 6
Private room in cabin | Other | 6
Private room in cave | Other | 5
Private room in guest suite | Other | 5
Private room in minsu | Other | 5
Barn | Other | 4
Entire guest suite | Other | 4
Private room in barn | Other | 4
Private room in castle | Other | 4
Ranch | Other | 4
Shared room in bed and breakfast | Other | 4
Shared room in cottage | Other | 4
Shared room in farm stay | Other | 4
Shared room in kezhan | Other | 4
Casa particular | Other | 3
Minsu | Other | 3
Private room in condominium (condo) | Other | 3
Shared room in boutique hotel | Other | 3
Tiny house | Other | 3
Castle | Other | 2
Entire guesthouse | Other | 2
Entire resort | Other | 2
Hut | Other | 2
Private room in hut | Other | 2
Private room in ranch | Other | 2
Private room in tiny house | Other | 2
Private room in treehouse | Other | 2
Shared room in rental unit | Other | 2
Shared room in villa | Other | 2
Cave | Other | 1
Entire hostel | Other | 1
Holiday park | Other | 1
Houseboat | Other | 1
Pension | Other | 1
Private room in camper/rv | Other | 1
Private room in dome house | Other | 1
Private room in ryokan | Other | 1
Private room in shipping container | Other | 1
Riad | Other | 1
Shared room in earth house | Other | 1
Shared room in townhouse | Other | 1
Treehouse | Other | 1

Airbnb is most commonly used for travel purposes, i.e., as an alternative to traditional hotels. We only want to include listings in our regression analysis that are intended for travel purposes:

# How many listings require each minimum length of stay
count(listings, minimum_nights)

minimum_nights | n
1 | 6197
2 | 44
3 | 8
4 | 1
5 | 2
7 | 5
10 | 8
15 | 2
29 | 7
30 | 18
360 | 1
365 | 3
The most common value for minimum_nights is 1.

There are some unusual values for minimum_nights, such as 29, 30, 360 and 365 nights. Airbnb hosts presumably set such long minimum stays to encourage customers to stay longer and spend more money.

# Keep only listings whose minimum stay is at most 4 nights
listings <- filter(listings, minimum_nights <= 4)

2 Mapping

# Interactive map with one small marker per short-stay listing; hovering
# shows the property type and clicking shows the listing URL
listings %>%
  filter(minimum_nights <= 4) %>%
  leaflet() %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    fillColor = "blue",
    fillOpacity = 0.4,
    popup = ~listing_url,
    label = ~property_type
  )

3 Regression Analysis

For the target variable \(Y\), we will use the cost for two people to stay at an Airbnb location for four (4) nights.

Create a new variable called price_4_nights that uses price and accommodates to calculate the total cost for two people to stay at the Airbnb property for 4 nights. This is the variable \(Y\) we want to explain.

# Cost for two people staying four nights: the nightly price per person
# (price / accommodates), multiplied by 2 people and then by 4 nights
listings <- mutate(
  listings,
  price_4_nights = price / accommodates * 2 * 4
)

Use histograms or density plots to examine the distributions of price_4_nights and log(price_4_nights). Which variable should you use for the regression model? Why?

# Density of the raw price_4_nights
ggplot(listings) +
  geom_density(aes(x = price_4_nights))

# Density of log(price_4_nights), for comparison with the raw version
ggplot(listings) +
  geom_density(aes(x = log(price_4_nights)))

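Linear regression inference assumes that the residuals are approximately normally distributed, which is far more plausible when the response variable itself is close to normal. Since log(price_4_nights) fits the normal distribution much better, it should be used instead of the raw price_4_nights variable.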

We fitted a regression model called model1 with the following explanatory variables: prop_type_simplified, number_of_reviews, and review_scores_rating.

# Model 1: log 4-night price explained by simplified property type, number of
# reviews and review score
model1 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating,
  data = listings
)

msummary(model1)
                                                Estimate Std. Error t value
(Intercept)                                    7.6918165  0.0640998 119.998
prop_type_simplifiedEntire villa               0.1473064  0.0463454   3.178
prop_type_simplifiedFarm stay                 -0.2983630  0.0501030  -5.955
prop_type_simplifiedOther                     -0.0419324  0.0374138  -1.121
prop_type_simplifiedPrivate room in farm stay -0.8578970  0.0572267 -14.991
number_of_reviews                             -0.0012475  0.0007375  -1.692
review_scores_rating                           0.0125218  0.0115063   1.088
                                              Pr(>|t|)    
(Intercept)                                    < 2e-16 ***
prop_type_simplifiedEntire villa                0.0015 ** 
prop_type_simplifiedFarm stay                 2.89e-09 ***
prop_type_simplifiedOther                       0.2625    
prop_type_simplifiedPrivate room in farm stay  < 2e-16 ***
number_of_reviews                               0.0908 .  
review_scores_rating                            0.2766    

Residual standard error: 0.6338 on 3074 degrees of freedom
  (3169 observations deleted due to missingness)
Multiple R-squared:  0.1125,    Adjusted R-squared:  0.1108 
F-statistic: 64.93 on 6 and 3074 DF,  p-value: < 2.2e-16

“review_scores_rating” is not a significant predictor of log(price_4_nights), since its p-value is about 0.28, which is well above the 0.05 threshold.

Since the “prop_type_simplified” variable is categorical, we need to interpret each property type separately:

  • the coefficient of “Other” is not significant (|t value| < 2)
  • “Entire Residential Home” is the base category; its significant intercept of 7.69 is the expected log(price_4_nights) for this property type when the other predictors are zero.
  • the property type “Entire Villa” has a significant coefficient of 0.147. Adding it to the intercept of the base category gives 7.84, which implies that this type of property is priced about 0.147 log units above the base category.
  • the property type “Farm stay” has a significant coefficient of -0.298, i.e. an absolute intercept of 7.39, which implies that this type of property is priced about 0.298 log units below the base category.
  • the property type “Private room in a farm stay” has a significant coefficient of -0.858, i.e. an absolute intercept of 6.834, which implies that this type of property is priced about 0.858 log units below the base category.

We want to determine if room_type is a significant predictor of the cost for 4 nights, given everything else in the model.

# Model 2: model 1 plus room_type
model2 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type,
  data = listings
)

msummary(model2)
                                                Estimate Std. Error t value
(Intercept)                                    7.7010521  0.0635534 121.175
prop_type_simplifiedEntire villa               0.1471316  0.0459346   3.203
prop_type_simplifiedFarm stay                 -0.2981248  0.0496591  -6.003
prop_type_simplifiedOther                      0.0723514  0.0404026   1.791
prop_type_simplifiedPrivate room in farm stay -0.6563589  0.0640321 -10.250
number_of_reviews                             -0.0010252  0.0007354  -1.394
review_scores_rating                           0.0102848  0.0114091   0.901
room_typePrivate room                         -0.2010158  0.0297561  -6.755
room_typeShared room                          -0.4593022  0.1047868  -4.383
                                              Pr(>|t|)    
(Intercept)                                    < 2e-16 ***
prop_type_simplifiedEntire villa               0.00137 ** 
prop_type_simplifiedFarm stay                 2.16e-09 ***
prop_type_simplifiedOther                      0.07343 .  
prop_type_simplifiedPrivate room in farm stay  < 2e-16 ***
number_of_reviews                              0.16337    
review_scores_rating                           0.36741    
room_typePrivate room                         1.70e-11 ***
room_typeShared room                          1.21e-05 ***

Residual standard error: 0.6282 on 3072 degrees of freedom
  (3169 observations deleted due to missingness)
Multiple R-squared:  0.1287,    Adjusted R-squared:  0.1264 
F-statistic: 56.73 on 8 and 3072 DF,  p-value: < 2.2e-16

In model2, we can observe that room_type is a significant predictor: both the “Private room” and “Shared room” categories have p-values lower than 0.05.

3.1 Further variables/questions to explore on our own

  1. Are the number of bathrooms, bedrooms, beds, or size of the house (accommodates) significant predictors of price_4_nights? Or might these be collinear variables?
# Model 3: size-related predictors only (bedrooms, beds, accommodates)
model_3 <- lm(
  log(price_4_nights) ~ bedrooms + beds + accommodates,
  data = listings
)

msummary(model_3)
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   7.760921   0.016737 463.704  < 2e-16 ***
bedrooms      0.056822   0.008474   6.705 2.19e-11 ***
beds         -0.026947   0.003826  -7.042 2.09e-12 ***
accommodates -0.018089   0.003684  -4.911 9.32e-07 ***

Residual standard error: 0.7622 on 6168 degrees of freedom
  (78 observations deleted due to missingness)
Multiple R-squared:  0.01607,   Adjusted R-squared:  0.01559 
F-statistic: 33.57 on 3 and 6168 DF,  p-value: < 2.2e-16
# Variance inflation factors for model_3.
# The model is passed exactly once: piping it into car::vif(model_3) handed it
# over a second time as an extra positional argument, which recent car
# releases match to the `type` parameter of the lm method and reject.
car::vif(model_3)
    bedrooms         beds accommodates 
    4.744730     2.926654     3.776660 

In the Beijing data set the “bathrooms” variable is empty. Therefore, the analysis is done with the “bedrooms”, “beds” and “accommodates” variables. All of these variables are found to be significant.

When we investigate the VIFs, we see that none of them is higher than 5, although the “bedrooms” variable comes close with a VIF of 4.7, so these could potentially be collinear variables. Therefore, we will add beds and accommodates but not bedrooms. These variables will be added at the very end so that they do not affect the collinearity of the other models.

  1. Do superhosts (host_is_superhost) command a pricing premium, after controlling for other variables?
# Model 4: model 2 plus accommodates and the superhost flag
model_4 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + accommodates + host_is_superhost,
  data = listings
)

msummary(model_4)
                                                Estimate Std. Error t value
(Intercept)                                    7.9396873  0.0683955 116.085
prop_type_simplifiedEntire villa               0.1747360  0.0454328   3.846
prop_type_simplifiedFarm stay                 -0.3087231  0.0489801  -6.303
prop_type_simplifiedOther                      0.0225057  0.0401533   0.560
prop_type_simplifiedPrivate room in farm stay -0.6753030  0.0631445 -10.695
number_of_reviews                             -0.0018659  0.0007367  -2.533
review_scores_rating                           0.0057109  0.0113171   0.505
room_typePrivate room                         -0.3008144  0.0313469  -9.596
room_typeShared room                          -0.5664130  0.1047953  -5.405
accommodates                                  -0.0234069  0.0026911  -8.698
host_is_superhostTRUE                          0.0844251  0.0243123   3.473
                                              Pr(>|t|)    
(Intercept)                                    < 2e-16 ***
prop_type_simplifiedEntire villa              0.000122 ***
prop_type_simplifiedFarm stay                 3.34e-10 ***
prop_type_simplifiedOther                     0.575183    
prop_type_simplifiedPrivate room in farm stay  < 2e-16 ***
number_of_reviews                             0.011364 *  
review_scores_rating                          0.613856    
room_typePrivate room                          < 2e-16 ***
room_typeShared room                          6.98e-08 ***
accommodates                                   < 2e-16 ***
host_is_superhostTRUE                         0.000523 ***

Residual standard error: 0.619 on 3070 degrees of freedom
  (3169 observations deleted due to missingness)
Multiple R-squared:  0.1543,    Adjusted R-squared:  0.1516 
F-statistic: 56.03 on 10 and 3070 DF,  p-value: < 2.2e-16

Yes, actually it is safe to say super hosts command a price premium since it is a significant variable in the model and its beta is positive.

  1. Some hosts allow you to immediately book their listing (instant_bookable == TRUE), while a non-trivial proportion don't. After controlling for other variables, is instant_bookable a significant predictor of price_4_nights?
# Model 5: model 4 plus instant_bookable
model_5 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + accommodates + host_is_superhost +
    instant_bookable,
  data = listings
)

msummary(model_5)
                                                Estimate Std. Error t value
(Intercept)                                    7.8873138  0.0693186 113.783
prop_type_simplifiedEntire villa               0.1725385  0.0453109   3.808
prop_type_simplifiedFarm stay                 -0.3091861  0.0488456  -6.330
prop_type_simplifiedOther                      0.0237491  0.0400440   0.593
prop_type_simplifiedPrivate room in farm stay -0.6645221  0.0630224 -10.544
number_of_reviews                             -0.0018775  0.0007347  -2.556
review_scores_rating                           0.0025309  0.0113109   0.224
room_typePrivate room                         -0.2997031  0.0312619  -9.587
room_typeShared room                          -0.5794905  0.1045527  -5.543
accommodates                                  -0.0236088  0.0026841  -8.796
host_is_superhostTRUE                          0.0733655  0.0243856   3.009
instant_bookableTRUE                           0.1031757  0.0243532   4.237
                                              Pr(>|t|)    
(Intercept)                                    < 2e-16 ***
prop_type_simplifiedEntire villa              0.000143 ***
prop_type_simplifiedFarm stay                 2.81e-10 ***
prop_type_simplifiedOther                     0.553175    
prop_type_simplifiedPrivate room in farm stay  < 2e-16 ***
number_of_reviews                             0.010649 *  
review_scores_rating                          0.822963    
room_typePrivate room                          < 2e-16 ***
room_typeShared room                          3.23e-08 ***
accommodates                                   < 2e-16 ***
host_is_superhostTRUE                         0.002646 ** 
instant_bookableTRUE                          2.34e-05 ***

Residual standard error: 0.6173 on 3069 degrees of freedom
  (3169 observations deleted due to missingness)
Multiple R-squared:  0.1593,    Adjusted R-squared:  0.1562 
F-statistic: 52.85 on 11 and 3069 DF,  p-value: < 2.2e-16
# (Generalised) variance inflation factors for model_5.
# The model is passed exactly once: piping it into car::vif(model_5) handed it
# over a second time as an extra positional argument, which recent car
# releases match to the `type` parameter of the lm method and reject.
car::vif(model_5)
                         GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.748655  4        1.072354
number_of_reviews    1.062425  1        1.030740
review_scores_rating 1.024080  1        1.011969
room_type            1.903971  2        1.174668
accommodates         1.457615  1        1.207317
host_is_superhost    1.083429  1        1.040879
instant_bookable     1.024292  1        1.012073
  1. For all cities, there are 3 variables that relate to neighbourhoods: neighbourhood, neighbourhood_cleansed, and neighbourhood_group_cleansed. There are typically more than 20 neighbourhoods in each city, and it wouldn't make sense to include them all in your model. Use your city knowledge, or ask someone with city knowledge, and see whether you can group neighbourhoods together so the majority of listings falls in fewer (5-6 max) geographical areas. You would thus need to create a new categorical variable neighbourhood_simplified and determine whether location is a predictor of price_4_nights
# Reducing the categories for the neighbourhood: Huairou, Yanqing and Miyun
# keep their own level, every other district is pooled into "Other".
# The three districts are recognised by the romanised name at the end of
# neighbourhood_cleansed instead of by the full label, so the grouping does
# not depend on how the Chinese characters are encoded in this file.
listings <- listings %>%
  mutate(neighbourhood_simplified = case_when(
    grepl("/ (Huairou|Yanqing|Miyun)$", neighbourhood_cleansed) ~ neighbourhood_cleansed,
    TRUE ~ "Other"
  ))


# Checking our results: number of listings per original/simplified
# neighbourhood pair, largest groups first
count(listings, neighbourhood_cleansed, neighbourhood_simplified, sort = TRUE)
neighbourhood_cleansed   neighbourhood_simplified      n
怀柔区 / Huairou         怀柔区 / Huairou           1770
延庆县 / Yanqing         延庆县 / Yanqing           1550
密云县 / Miyun           密云县 / Miyun              976
房山区                   Other                       489
昌平区                   Other                       279
东城区                   Other                       278
平谷区 / Pinggu          Other                       249
门头沟区 / Mentougou     Other                       140
通州区 / Tongzhou        Other                       136
朝阳区 / Chaoyang        Other                        94
西城区                   Other                        89
顺义区 / Shunyi          Other                        82
大兴区 / Daxing          Other                        47
海淀区                   Other                        42
丰台区 / Fengtai         Other                        27
石景山区                 Other                         2
# Creating a 6th model: model 5 plus the simplified neighbourhood
model_6 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + accommodates + host_is_superhost +
    instant_bookable + neighbourhood_simplified,
  data = listings
)

msummary(model_6)
                                                Estimate Std. Error t value
(Intercept)                                    7.8380388  0.0697560 112.364
prop_type_simplifiedEntire villa               0.2105253  0.0448471   4.694
prop_type_simplifiedFarm stay                 -0.3002228  0.0481599  -6.234
prop_type_simplifiedOther                      0.0368615  0.0394976   0.933
prop_type_simplifiedPrivate room in farm stay -0.6404506  0.0627714 -10.203
number_of_reviews                             -0.0015691  0.0007292  -2.152
review_scores_rating                           0.0016299  0.0111431   0.146
room_typePrivate room                         -0.3137658  0.0312073 -10.054
room_typeShared room                          -0.5437272  0.1037815  -5.239
accommodates                                  -0.0265040  0.0026652  -9.945
host_is_superhostTRUE                          0.0774135  0.0240314   3.221
instant_bookableTRUE                           0.1045542  0.0241070   4.337
neighbourhood_simplifiedåÆ†äŗ‘åŽæ / Miyun        -0.0490435  0.0340834  -1.439
neighbourhood_simplifiedå»¶åŗ†åŽæ / Yanqing       0.0195919  0.0307229   0.638
neighbourhood_simplifiedę€€ęŸ”åŒŗ / Huairou       0.2363871  0.0291593   8.107
                                              Pr(>|t|)    
(Intercept)                                    < 2e-16 ***
prop_type_simplifiedEntire villa              2.79e-06 ***
prop_type_simplifiedFarm stay                 5.17e-10 ***
prop_type_simplifiedOther                      0.35076    
prop_type_simplifiedPrivate room in farm stay  < 2e-16 ***
number_of_reviews                              0.03149 *  
review_scores_rating                           0.88372    
room_typePrivate room                          < 2e-16 ***
room_typeShared room                          1.72e-07 ***
accommodates                                   < 2e-16 ***
host_is_superhostTRUE                          0.00129 ** 
instant_bookableTRUE                          1.49e-05 ***
neighbourhood_simplifiedåÆ†äŗ‘åŽæ / Miyun         0.15027    
neighbourhood_simplifiedå»¶åŗ†åŽæ / Yanqing       0.52372    
neighbourhood_simplifiedę€€ęŸ”åŒŗ / Huairou      7.44e-16 ***

Residual standard error: 0.6081 on 3066 degrees of freedom
  (3169 observations deleted due to missingness)
Multiple R-squared:  0.1849,    Adjusted R-squared:  0.1812 
F-statistic: 49.69 on 14 and 3066 DF,  p-value: < 2.2e-16
# (Generalised) variance inflation factors for model_6.
# The model is passed exactly once: piping it into car::vif(model_6) handed it
# over a second time as an extra positional argument, which recent car
# releases match to the `type` parameter of the lm method and reject.
car::vif(model_6)
                             GVIF Df GVIF^(1/(2*Df))
prop_type_simplified     1.813047  4        1.077212
number_of_reviews        1.078582  1        1.038548
review_scores_rating     1.024223  1        1.012039
room_type                1.974884  2        1.185456
accommodates             1.480890  1        1.216918
host_is_superhost        1.084273  1        1.041284
instant_bookable         1.034293  1        1.017002
neighbourhood_simplified 1.139028  3        1.021933

The districts are divided into four according to the knowledge of the group members. These districts are Huairou, Yanqing, Miyun and others.

Only Huairou district is significant and it demands a premium on price. This makes sense since Huairou is a high quality district and quite popular on social media.

  1. What is the effect of availability_30 or reviews_per_month on price_4_nights, after we control for other variables?
# Model 7: model 6 plus availability_30 and reviews_per_month
model_7 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + accommodates + host_is_superhost +
    instant_bookable + neighbourhood_simplified + availability_30 +
    reviews_per_month,
  data = listings
)

msummary(model_7)
                                                Estimate Std. Error t value
(Intercept)                                    7.7513655  0.0731219 106.006
prop_type_simplifiedEntire villa               0.2041332  0.0446815   4.569
prop_type_simplifiedFarm stay                 -0.2920707  0.0479909  -6.086
prop_type_simplifiedOther                      0.0381457  0.0393449   0.970
prop_type_simplifiedPrivate room in farm stay -0.6278441  0.0625751 -10.033
number_of_reviews                             -0.0034081  0.0008401  -4.057
review_scores_rating                           0.0008305  0.0111014   0.075
room_typePrivate room                         -0.3118790  0.0310819 -10.034
room_typeShared room                          -0.5505730  0.1038216  -5.303
accommodates                                  -0.0268431  0.0026568 -10.104
host_is_superhostTRUE                          0.0740687  0.0239481   3.093
instant_bookableTRUE                           0.0847914  0.0243592   3.481
neighbourhood_simplifiedåÆ†äŗ‘åŽæ / Miyun        -0.0396342  0.0342567  -1.157
neighbourhood_simplifiedå»¶åŗ†åŽæ / Yanqing       0.0320826  0.0307525   1.043
neighbourhood_simplifiedę€€ęŸ”åŒŗ / Huairou       0.2508672  0.0292427   8.579
availability_30                                0.0036026  0.0012986   2.774
reviews_per_month                              0.0686400  0.0156278   4.392
                                              Pr(>|t|)    
(Intercept)                                    < 2e-16 ***
prop_type_simplifiedEntire villa              5.10e-06 ***
prop_type_simplifiedFarm stay                 1.30e-09 ***
prop_type_simplifiedOther                     0.332362    
prop_type_simplifiedPrivate room in farm stay  < 2e-16 ***
number_of_reviews                             5.10e-05 ***
review_scores_rating                          0.940372    
room_typePrivate room                          < 2e-16 ***
room_typeShared room                          1.22e-07 ***
accommodates                                   < 2e-16 ***
host_is_superhostTRUE                         0.002000 ** 
instant_bookableTRUE                          0.000507 ***
neighbourhood_simplifiedåÆ†äŗ‘åŽæ / Miyun        0.247373    
neighbourhood_simplifiedå»¶åŗ†åŽæ / Yanqing      0.296913    
neighbourhood_simplifiedę€€ęŸ”åŒŗ / Huairou       < 2e-16 ***
availability_30                               0.005569 ** 
reviews_per_month                             1.16e-05 ***

Residual standard error: 0.6057 on 3064 degrees of freedom
  (3169 observations deleted due to missingness)
Multiple R-squared:  0.1921,    Adjusted R-squared:  0.1879 
F-statistic: 45.54 on 16 and 3064 DF,  p-value: < 2.2e-16
# (Generalised) variance inflation factors for model_7.
# The model is passed exactly once: piping it into car::vif(model_7) handed it
# over a second time as an extra positional argument, which recent car
# releases match to the `type` parameter of the lm method and reject.
car::vif(model_7)
                             GVIF Df GVIF^(1/(2*Df))
prop_type_simplified     1.828539  4        1.078358
number_of_reviews        1.443470  1        1.201445
review_scores_rating     1.024933  1        1.012390
room_type                1.994068  2        1.188324
accommodates             1.483677  1        1.218063
host_is_superhost        1.085628  1        1.041935
instant_bookable         1.064740  1        1.031862
neighbourhood_simplified 1.169792  3        1.026482
availability_30          1.049162  1        1.024286
reviews_per_month        1.443963  1        1.201650

3.2 Diagnostics, collinearity, summary tables

As you keep building your models, it makes sense to:

# Final model. Compared with model_7, review_scores_rating is dropped because
# it was not a significant variable, and beds is added (it does not create a
# significant collinearity problem).
model_final <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    room_type + accommodates + host_is_superhost + instant_bookable +
    neighbourhood_simplified + availability_30 + reviews_per_month + beds,
  data = listings
)

msummary(model_final)
                                                Estimate Std. Error t value
(Intercept)                                    7.7473387  0.0538902 143.762
prop_type_simplifiedEntire villa               0.2075827  0.0446161   4.653
prop_type_simplifiedFarm stay                 -0.2923000  0.0479006  -6.102
prop_type_simplifiedOther                      0.0371538  0.0392765   0.946
prop_type_simplifiedPrivate room in farm stay -0.6190147  0.0625247  -9.900
number_of_reviews                             -0.0033187  0.0008387  -3.957
room_typePrivate room                         -0.3082612  0.0311559  -9.894
room_typeShared room                          -0.5211982  0.1045836  -4.984
accommodates                                  -0.0208866  0.0038007  -5.495
host_is_superhostTRUE                          0.0745753  0.0237817   3.136
instant_bookableTRUE                           0.0865071  0.0242825   3.563
neighbourhood_simplifiedåÆ†äŗ‘åŽæ / Miyun        -0.0403816  0.0341986  -1.181
neighbourhood_simplifiedå»¶åŗ†åŽæ / Yanqing       0.0311728  0.0307198   1.015
neighbourhood_simplifiedę€€ęŸ”åŒŗ / Huairou       0.2470570  0.0292239   8.454
availability_30                                0.0036350  0.0012978   2.801
reviews_per_month                              0.0670759  0.0156150   4.296
beds                                          -0.0090521  0.0042276  -2.141
                                              Pr(>|t|)    
(Intercept)                                    < 2e-16 ***
prop_type_simplifiedEntire villa              3.42e-06 ***
prop_type_simplifiedFarm stay                 1.18e-09 ***
prop_type_simplifiedOther                     0.344247    
prop_type_simplifiedPrivate room in farm stay  < 2e-16 ***
number_of_reviews                             7.76e-05 ***
room_typePrivate room                          < 2e-16 ***
room_typeShared room                          6.59e-07 ***
accommodates                                  4.22e-08 ***
host_is_superhostTRUE                         0.001730 ** 
instant_bookableTRUE                          0.000373 ***
neighbourhood_simplifiedåÆ†äŗ‘åŽæ / Miyun        0.237775    
neighbourhood_simplifiedå»¶åŗ†åŽæ / Yanqing      0.310308    
neighbourhood_simplifiedę€€ęŸ”åŒŗ / Huairou       < 2e-16 ***
availability_30                               0.005130 ** 
reviews_per_month                             1.80e-05 ***
beds                                          0.032335 *  

Residual standard error: 0.6046 on 3061 degrees of freedom
  (3172 observations deleted due to missingness)
Multiple R-squared:  0.1939,    Adjusted R-squared:  0.1897 
F-statistic: 46.01 on 16 and 3061 DF,  p-value: < 2.2e-16
# (Generalised) variance inflation factors for the final model.
# The model is passed exactly once: piping it into car::vif(model_final)
# handed it over a second time as an extra positional argument, which recent
# car releases match to the `type` parameter of the lm method and reject.
car::vif(model_final)
                             GVIF Df GVIF^(1/(2*Df))
prop_type_simplified     1.840573  4        1.079243
number_of_reviews        1.443587  1        1.201494
room_type                2.036250  2        1.194559
accommodates             3.043999  1        1.744706
host_is_superhost        1.073473  1        1.036085
instant_bookable         1.060213  1        1.029667
neighbourhood_simplified 1.171404  3        1.026718
availability_30          1.049500  1        1.024451
reviews_per_month        1.446779  1        1.202821
beds                     2.444336  1        1.563437
  1. Check the residuals, using autoplot(model_x)
# Final Model: (generalised) variance inflation factors.
# The model is passed exactly once: piping it into car::vif(model_final)
# handed it over a second time as an extra positional argument, which recent
# car releases match to the `type` parameter of the lm method and reject.
car::vif(model_final)
                             GVIF Df GVIF^(1/(2*Df))
prop_type_simplified     1.840573  4        1.079243
number_of_reviews        1.443587  1        1.201494
room_type                2.036250  2        1.194559
accommodates             3.043999  1        1.744706
host_is_superhost        1.073473  1        1.036085
instant_bookable         1.060213  1        1.029667
neighbourhood_simplified 1.171404  3        1.026718
availability_30          1.049500  1        1.024451
reviews_per_month        1.446779  1        1.202821
beds                     2.444336  1        1.563437
# Diagnostic plots for the final model, used for the residual checks.
# NOTE(review): autoplot() for lm objects presumably comes from ggfortify;
# confirm that package is loaded earlier in the document.
autoplot(model_final)

The Residuals vs Fitted graph doesn't seem to follow a specific pattern, which supports the linearity assumption of the regression. The normal Q-Q graph, while not perfect, seems to be consistent with the normality assumption.

# (Generalised) variance inflation factors for the final model, used for the
# collinearity check.
# The model is passed exactly once: piping it into car::vif(model_final)
# handed it over a second time as an extra positional argument, which recent
# car releases match to the `type` parameter of the lm method and reject.
car::vif(model_final)
                             GVIF Df GVIF^(1/(2*Df))
prop_type_simplified     1.840573  4        1.079243
number_of_reviews        1.443587  1        1.201494
room_type                2.036250  2        1.194559
accommodates             3.043999  1        1.744706
host_is_superhost        1.073473  1        1.036085
instant_bookable         1.060213  1        1.029667
neighbourhood_simplified 1.171404  3        1.026718
availability_30          1.049500  1        1.024451
reviews_per_month        1.446779  1        1.202821
beds                     2.444336  1        1.563437

Collinearity is checked at every stage of explanatory variable addition and no significant collinearity problem exists in the data set since all VIF values are significantly below 5.

  1. We created a summary table, using huxtable:
library(huxtable)

# Side-by-side regression table for the fitted models (model_3 is left out)
huxreg(
  model1, model2,
  model_4, model_5, model_6, model_7,
  model_final
)
(1)(2)(3)(4)(5)(6)(7)
(Intercept)7.692 ***7.701 ***7.940 ***7.887 ***7.838 ***7.751 ***7.747 ***
(0.064)   (0.064)   (0.068)   (0.069)   (0.070)   (0.073)   (0.054)   
prop_type_simplifiedEntire villa0.147 ** 0.147 ** 0.175 ***0.173 ***0.211 ***0.204 ***0.208 ***
(0.046)   (0.046)   (0.045)   (0.045)   (0.045)   (0.045)   (0.045)   
prop_type_simplifiedFarm stay-0.298 ***-0.298 ***-0.309 ***-0.309 ***-0.300 ***-0.292 ***-0.292 ***
(0.050)   (0.050)   (0.049)   (0.049)   (0.048)   (0.048)   (0.048)   
prop_type_simplifiedOther-0.042    0.072    0.023    0.024    0.037    0.038    0.037    
(0.037)   (0.040)   (0.040)   (0.040)   (0.039)   (0.039)   (0.039)   
prop_type_simplifiedPrivate room in farm stay-0.858 ***-0.656 ***-0.675 ***-0.665 ***-0.640 ***-0.628 ***-0.619 ***
(0.057)   (0.064)   (0.063)   (0.063)   (0.063)   (0.063)   (0.063)   
number_of_reviews-0.001    -0.001    -0.002 *  -0.002 *  -0.002 *  -0.003 ***-0.003 ***
(0.001)   (0.001)   (0.001)   (0.001)   (0.001)   (0.001)   (0.001)   
review_scores_rating0.013    0.010    0.006    0.003    0.002    0.001            
(0.012)   (0.011)   (0.011)   (0.011)   (0.011)   (0.011)           
room_typePrivate room        -0.201 ***-0.301 ***-0.300 ***-0.314 ***-0.312 ***-0.308 ***
        (0.030)   (0.031)   (0.031)   (0.031)   (0.031)   (0.031)   
room_typeShared room        -0.459 ***-0.566 ***-0.579 ***-0.544 ***-0.551 ***-0.521 ***
        (0.105)   (0.105)   (0.105)   (0.104)   (0.104)   (0.105)   
accommodates                -0.023 ***-0.024 ***-0.027 ***-0.027 ***-0.021 ***
                (0.003)   (0.003)   (0.003)   (0.003)   (0.004)   
host_is_superhostTRUE                0.084 ***0.073 ** 0.077 ** 0.074 ** 0.075 ** 
                (0.024)   (0.024)   (0.024)   (0.024)   (0.024)   
instant_bookableTRUE                        0.103 ***0.105 ***0.085 ***0.087 ***
                        (0.024)   (0.024)   (0.024)   (0.024)   
neighbourhood_simplifiedåÆ†äŗ‘åŽæ / Miyun                                -0.049    -0.040    -0.040    
                                (0.034)   (0.034)   (0.034)   
neighbourhood_simplifiedå»¶åŗ†åŽæ / Yanqing                                0.020    0.032    0.031    
                                (0.031)   (0.031)   (0.031)   
neighbourhood_simplifiedę€€ęŸ”åŒŗ / Huairou                                0.236 ***0.251 ***0.247 ***
                                (0.029)   (0.029)   (0.029)   
availability_30                                        0.004 ** 0.004 ** 
                                        (0.001)   (0.001)   
reviews_per_month                                        0.069 ***0.067 ***
                                        (0.016)   (0.016)   
beds                                                -0.009 *  
                                                (0.004)   
N3081        3081        3081        3081        3081        3081        3078        
R20.112    0.129    0.154    0.159    0.185    0.192    0.194    
logLik-2963.099    -2934.665    -2888.684    -2879.701    -2831.922    -2818.295    -2809.926    
AIC5942.197    5889.330    5801.368    5785.402    5695.844    5672.589    5655.853    
*** p < 0.001; ** p < 0.01; * p < 0.05.
  1. Testing the predictability of our model:
# The data set is filtered to private rooms with at least 10 reviews and a
# review score of 4.5 or more. Filtered dataframe has 161 obs.
filtered_listings <- filter(
  listings,
  room_type == "Private room",
  number_of_reviews >= 10,
  review_scores_rating >= 4.5
)
  
  
# Cost and prediction intervals are predicted on the log scale for the
# filtered listings, then exponentiated back to prices (columns fit, lwr, upr)
predicted <- model_final %>%
  predict(filtered_listings, interval = "prediction") %>%
  exp() %>%
  data.frame()

# Average of the fitted price and of the upper and lower prediction bounds
summarise(
  predicted,
  avg_price_4_nights = mean(fit),
  upper = mean(upr),
  lower = mean(lwr)
)
avg_price_4_nights     upper  lower
          1.95e+03  6.41e+03    592
# Give both data frames the same row index so that each prediction can be
# joined back onto the listing it was computed from. seq_len() is used rather
# than seq.int(nrow(x)) because the latter yields c(1, 0) for a zero-row data
# frame, which cannot be assigned as a column.
predicted$ID <- seq_len(nrow(predicted))
filtered_listings$ID <- seq_len(nrow(filtered_listings))

# Attach fit/lwr/upr to the listings and sort by fitted price, highest first
filtered_listings_ordered <- filtered_listings %>%
  left_join(predicted, by = "ID") %>%
  arrange(desc(fit))

# Re-number the rows in their sorted order; ID is the x position in the plot
filtered_listings_ordered$ID <- seq_len(nrow(filtered_listings_ordered))


# A line graph is created to observe how well the data is predicted: fitted
# and actual 4-night prices are drawn as coloured lines, the upper and lower
# prediction bounds as plain lines.
# Rows whose upper prediction bound (upr) is 10000 or more are filtered out,
# since two values were outliers.

filtered_listings_ordered %>%
  filter(upr < 10000) %>%
  ggplot(aes(x = ID)) +
  geom_line(aes(y = fit, col = "Fitted Line")) +
  # Legend label corrected from "Actal Price"
  geom_line(aes(y = price_4_nights, col = "Actual Price")) +
  geom_line(aes(y = upr)) +
  geom_line(aes(y = lwr)) +
  xlab("Observations") +
  ylab("Price for 4 Nights") +
  theme_minimal()

Utku Odabasi Chloé Baubier Jiacheng Zhu Jay Bensal Mengtian Li Yaxin Liu